{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 08 Interquartile range (IQR)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%html\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pandas import Series, DataFrame\n", "import matplotlib.pyplot as plt\n", "from scipy import stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[khanacademy](https://www.khanacademy.org/math/ap-statistics/summarizing-quantitative-data-ap/measuring-spread-quantitative/v/calculating-interquartile-range-iqr?modal=1)\n", "[pandas.DataFrame.quantile](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.quantile.html)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Interquartile range (IQR) fig 1](./imgs/03-08-01.png)![Interquartile range (IQR) fig 2](./imgs/03-08-02.png)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Sample x: nine observations, listed in ascending order.\n", "x_data = {\n", "    'x': [4, 4, 6, 7, 10, 11, 12, 14, 15],\n", "}" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Sample y: nine observations, listed in ascending order.\n", "y_data = {\n", "    'y': [7, 9, 9, 10, 10, 11, 12, 12, 14],\n", "}" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Wrap each sample in its own single-column frame (column 'x' and column 'y').\n", "x_df, y_df = DataFrame(x_data), DataFrame(y_data)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
x
count9.000000
mean9.222222
std4.146618
min4.000000
25%6.000000
50%10.000000
75%12.000000
max15.000000
\n", "
" ], "text/plain": [ " x\n", "count 9.000000\n", "mean 9.222222\n", "std 4.146618\n", "min 4.000000\n", "25% 6.000000\n", "50% 10.000000\n", "75% 12.000000\n", "max 15.000000" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_df.describe()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
y
count9.000000
mean10.444444
std2.068279
min7.000000
25%9.000000
50%10.000000
75%12.000000
max14.000000
\n", "
" ], "text/plain": [ " y\n", "count 9.000000\n", "mean 10.444444\n", "std 2.068279\n", "min 7.000000\n", "25% 9.000000\n", "50% 10.000000\n", "75% 12.000000\n", "max 14.000000" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_df.describe()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "x 12.0\n", "Name: 0.75, dtype: float64\n", "x 12\n", "Name: 0.75, dtype: int64\n", "y 12.0\n", "Name: 0.75, dtype: float64\n", "y 12\n", "Name: 0.75, dtype: int64\n" ] } ], "source": [ "print(x_df.quantile(q=0.75))\n", "print(x_df.quantile(q=0.75, interpolation='nearest'))\n", "\n", "print(y_df.quantile(q=0.75))\n", "print(y_df.quantile(q=0.75, interpolation='nearest'))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6.0\n", "3.0\n" ] } ], "source": [ "# Pass the data column explicitly: stats.iqr flattens its input by default (axis=None),\n", "# so handing it the whole frame would mix in the helper columns that the cells below\n", "# add to x_df / y_df whenever this cell is re-run after them.\n", "print(stats.iqr(x_df['x']))\n", "print(stats.iqr(y_df['y']))" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# Empirical-CDF quartiles for x: the q-th quantile is the smallest observation whose\n", "# empirical CDF value (rank / n) is at least q.\n", "# Rank by value (ties broken by row order) instead of by row position, so the result\n", "# does not rely on the rows already being sorted.\n", "x_df['Rank'] = x_df['x'].rank(method='first').astype('int64')\n", "x_df['Empirical_CDF'] = x_df['Rank'] / x_df.shape[0]\n", "x_q_25 = x_df.loc[x_df['Empirical_CDF'] >= 0.25, 'x'].min()\n", "x_q_50 = x_df.loc[x_df['Empirical_CDF'] >= 0.50, 'x'].min()\n", "x_q_75 = x_df.loc[x_df['Empirical_CDF'] >= 0.75, 'x'].min()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# Empirical-CDF quartiles for y: the q-th quantile is the smallest observation whose\n", "# empirical CDF value (rank / n) is at least q.\n", "# Rank by value (ties broken by row order) instead of by row position, so the result\n", "# does not rely on the rows already being sorted.\n", "y_df['Rank'] = y_df['y'].rank(method='first').astype('int64')\n", "y_df['Empirical_CDF'] = y_df['Rank'] / y_df.shape[0]\n", "y_q_25 = y_df.loc[y_df['Empirical_CDF'] >= 0.25, 'y'].min()\n", "y_q_50 = y_df.loc[y_df['Empirical_CDF'] >= 0.50, 'y'].min()\n", "y_q_75 = y_df.loc[y_df['Empirical_CDF'] >= 0.75, 'y'].min()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "q 25 6 q 50 10 q 
75 12 => iqr = 6\n", "q 25 9 q 50 10 q 75 12 => iqr = 3\n" ] } ], "source": [ "# One summary line per sample: the three empirical quartiles, then IQR = Q3 - Q1.\n", "for q25, q50, q75 in ((x_q_25, x_q_50, x_q_75), (y_q_25, y_q_50, y_q_75)):\n", "    print(f'q 25 {q25} q 50 {q50} q 75 {q75} => iqr = {q75 - q25}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }